import urllib.request
import re

# Where the novel's table of contents lives, and what the novel is called.
# Chapter links found on the index page are appended to `index_page` later on.
index_page = "https://www.x9itan.com/133/133122/"
novel_name = '炼气3000层,开局收女帝为徒'
out_file = f"{novel_name}.txt"

# Start the output file with a two-line header (title, then source URL)
# followed by a blank line. Mode 'x' is exclusive creation: open() raises
# FileExistsError instead of overwriting a file that is already there.
with open(out_file, 'x', encoding='utf8') as fp:
    fp.writelines((novel_name + "\n", index_page + "\n\n"))


# Download the index page. The `with` block closes the HTTP response as soon
# as the body has been read instead of leaving the connection open.
with urllib.request.urlopen(index_page) as response:
    # The page bytes are decoded as GBK (not UTF-8).
    html = response.read().decode("gbk")

# Each chapter entry looks like <li><a href="N.html">name</a></li>.
# The name group is non-greedy so that several entries sharing one line yield
# one match each, rather than a single match swallowing everything up to the
# last "</a></li>" on that line.
chapter_info_regex = r'<li><a href=\"(?P<chapter_url>\d+\.html)\">(?P<chapter_name>.+?)</a></li>'

# With two groups in the pattern, findall returns (chapter_url, chapter_name)
# tuples in the order they appear in the page.
chapter_infos = re.findall(chapter_info_regex, html)

# Stored as a string because it is concatenated into the progress text.
total_chapter = str(len(chapter_infos))

# Running count of chapters processed so far.
current_chapter = 0

# Pattern for one line of chapter text: a run of "&nbsp;" entities, then the
# text itself (captured lazily), terminated by a <br/> tag, another "&nbsp;"
# or a closing </p>. Compiled once here because it is the same for every
# chapter.
content_pattern = re.compile(r'(?:&nbsp;)+(?P<line>.+?)(?:\<br\s*/\>|&nbsp;|\</p\>)')

# Fetch every chapter in index order and append it to the output file.
# enumerate(start=1) provides the 1-based progress counter.
for current_chapter, (chapter_url, chapter_name) in enumerate(chapter_infos, start=1):
    # Chapter links on the index page are relative file names ("123.html").
    chapter_full_url = index_page + chapter_url

    # Close each HTTP response once its body is read; otherwise one open
    # connection would be left behind per chapter.
    with urllib.request.urlopen(chapter_full_url) as response:
        html = response.read().decode('gbk')

    lines = content_pattern.findall(html)

    # The file is reopened in append mode for every chapter, so chapters that
    # were already written are flushed to disk before the next download.
    with open(out_file, 'a', encoding='utf8') as fp:
        fp.write("\n\n" + chapter_name + "\n")
        fp.writelines(line + "\n" for line in lines)

    # Progress report: "<done>/<total>\t<chapter name>finished".
    print(f"{current_chapter}/{total_chapter}\t{chapter_name}finished")






